{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "0bec7173",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import soundfile as sf\n",
    "import os\n",
    "from tqdm import tqdm\n",
    "from multiprocess import Pool\n",
    "import itertools\n",
    "\n",
    "def chunks(l, n):\n",
    "    for i in range(0, len(l), n):\n",
    "        yield (l[i: i + n], i // n)\n",
    "\n",
    "def multiprocessing(strings, function, cores=6, returned=True):\n",
    "    df_split = chunks(strings, len(strings) // cores)\n",
    "    pool = Pool(cores)\n",
    "    pooled = pool.map(function, df_split)\n",
    "    pool.close()\n",
    "    pool.join()\n",
    "\n",
    "    if returned:\n",
    "        return list(itertools.chain(*pooled))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "bbba08ae",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>primary_label</th>\n",
       "      <th>secondary_labels</th>\n",
       "      <th>type</th>\n",
       "      <th>latitude</th>\n",
       "      <th>longitude</th>\n",
       "      <th>scientific_name</th>\n",
       "      <th>common_name</th>\n",
       "      <th>author</th>\n",
       "      <th>date</th>\n",
       "      <th>filename</th>\n",
       "      <th>license</th>\n",
       "      <th>rating</th>\n",
       "      <th>time</th>\n",
       "      <th>url</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>acafly</td>\n",
       "      <td>['amegfi']</td>\n",
       "      <td>['begging call', 'call', 'juvenile']</td>\n",
       "      <td>35.3860</td>\n",
       "      <td>-84.1250</td>\n",
       "      <td>Empidonax virescens</td>\n",
       "      <td>Acadian Flycatcher</td>\n",
       "      <td>Mike Nelson</td>\n",
       "      <td>2012-08-12</td>\n",
       "      <td>XC109605.ogg</td>\n",
       "      <td>Creative Commons Attribution-NonCommercial-Sha...</td>\n",
       "      <td>2.5</td>\n",
       "      <td>09:30</td>\n",
       "      <td>https://www.xeno-canto.org/109605</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>acafly</td>\n",
       "      <td>[]</td>\n",
       "      <td>['call']</td>\n",
       "      <td>9.1334</td>\n",
       "      <td>-79.6501</td>\n",
       "      <td>Empidonax virescens</td>\n",
       "      <td>Acadian Flycatcher</td>\n",
       "      <td>Allen T. Chartier</td>\n",
       "      <td>2000-12-26</td>\n",
       "      <td>XC11209.ogg</td>\n",
       "      <td>Creative Commons Attribution-NonCommercial-Sha...</td>\n",
       "      <td>3.0</td>\n",
       "      <td>?</td>\n",
       "      <td>https://www.xeno-canto.org/11209</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>acafly</td>\n",
       "      <td>[]</td>\n",
       "      <td>['call']</td>\n",
       "      <td>5.7813</td>\n",
       "      <td>-75.7452</td>\n",
       "      <td>Empidonax virescens</td>\n",
       "      <td>Acadian Flycatcher</td>\n",
       "      <td>Sergio Chaparro-Herrera</td>\n",
       "      <td>2012-01-10</td>\n",
       "      <td>XC127032.ogg</td>\n",
       "      <td>Creative Commons Attribution-NonCommercial-Sha...</td>\n",
       "      <td>3.0</td>\n",
       "      <td>15:20</td>\n",
       "      <td>https://www.xeno-canto.org/127032</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>acafly</td>\n",
       "      <td>['whwbec1']</td>\n",
       "      <td>['call']</td>\n",
       "      <td>4.6717</td>\n",
       "      <td>-75.6283</td>\n",
       "      <td>Empidonax virescens</td>\n",
       "      <td>Acadian Flycatcher</td>\n",
       "      <td>Oscar Humberto Marin-Gomez</td>\n",
       "      <td>2009-06-19</td>\n",
       "      <td>XC129974.ogg</td>\n",
       "      <td>Creative Commons Attribution-NonCommercial-Sha...</td>\n",
       "      <td>3.5</td>\n",
       "      <td>07:50</td>\n",
       "      <td>https://www.xeno-canto.org/129974</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>acafly</td>\n",
       "      <td>['whwbec1']</td>\n",
       "      <td>['call']</td>\n",
       "      <td>4.6717</td>\n",
       "      <td>-75.6283</td>\n",
       "      <td>Empidonax virescens</td>\n",
       "      <td>Acadian Flycatcher</td>\n",
       "      <td>Oscar Humberto Marin-Gomez</td>\n",
       "      <td>2009-06-19</td>\n",
       "      <td>XC129981.ogg</td>\n",
       "      <td>Creative Commons Attribution-NonCommercial-Sha...</td>\n",
       "      <td>3.5</td>\n",
       "      <td>07:50</td>\n",
       "      <td>https://www.xeno-canto.org/129981</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  primary_label secondary_labels                                  type  \\\n",
       "0        acafly       ['amegfi']  ['begging call', 'call', 'juvenile']   \n",
       "1        acafly               []                              ['call']   \n",
       "2        acafly               []                              ['call']   \n",
       "3        acafly      ['whwbec1']                              ['call']   \n",
       "4        acafly      ['whwbec1']                              ['call']   \n",
       "\n",
       "   latitude  longitude      scientific_name         common_name  \\\n",
       "0   35.3860   -84.1250  Empidonax virescens  Acadian Flycatcher   \n",
       "1    9.1334   -79.6501  Empidonax virescens  Acadian Flycatcher   \n",
       "2    5.7813   -75.7452  Empidonax virescens  Acadian Flycatcher   \n",
       "3    4.6717   -75.6283  Empidonax virescens  Acadian Flycatcher   \n",
       "4    4.6717   -75.6283  Empidonax virescens  Acadian Flycatcher   \n",
       "\n",
       "                       author        date      filename  \\\n",
       "0                 Mike Nelson  2012-08-12  XC109605.ogg   \n",
       "1           Allen T. Chartier  2000-12-26   XC11209.ogg   \n",
       "2     Sergio Chaparro-Herrera  2012-01-10  XC127032.ogg   \n",
       "3  Oscar Humberto Marin-Gomez  2009-06-19  XC129974.ogg   \n",
       "4  Oscar Humberto Marin-Gomez  2009-06-19  XC129981.ogg   \n",
       "\n",
       "                                             license  rating   time  \\\n",
       "0  Creative Commons Attribution-NonCommercial-Sha...     2.5  09:30   \n",
       "1  Creative Commons Attribution-NonCommercial-Sha...     3.0      ?   \n",
       "2  Creative Commons Attribution-NonCommercial-Sha...     3.0  15:20   \n",
       "3  Creative Commons Attribution-NonCommercial-Sha...     3.5  07:50   \n",
       "4  Creative Commons Attribution-NonCommercial-Sha...     3.5  07:50   \n",
       "\n",
       "                                 url  \n",
       "0  https://www.xeno-canto.org/109605  \n",
       "1   https://www.xeno-canto.org/11209  \n",
       "2  https://www.xeno-canto.org/127032  \n",
       "3  https://www.xeno-canto.org/129974  \n",
       "4  https://www.xeno-canto.org/129981  "
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = pd.read_csv('train_metadata.csv')\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "89a659cf",
   "metadata": {},
   "outputs": [],
   "source": [
    "def loop(indices):\n",
    "    indices, _ = indices\n",
    "    df = pd.read_csv('train_metadata.csv')\n",
    "    data = []\n",
    "    for i in tqdm(indices):\n",
    "        audio_filename = os.path.join('birdsound', df['primary_label'].iloc[i], df['filename'].iloc[i])\n",
    "        if not os.path.exists(audio_filename):\n",
    "            continue\n",
    "        \n",
    "        y, sr = sf.read(audio_filename)\n",
    "        if (len(y) / sr) >= 30:\n",
    "            continue\n",
    "            \n",
    "        d = df.iloc[i].to_dict()\n",
    "        d['len'] = len(y) / sr\n",
    "        d['audio_filename'] = audio_filename\n",
    "        \n",
    "        data.append(d)\n",
    "    return data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "03757c96",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|███████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 26.63it/s]\n"
     ]
    }
   ],
   "source": [
    "processed = loop((list(range(10)), 0))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "f1a68348",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|███████████████████████████████████████████████████████████████████████████████████| 3143/3143 [02:09<00:00, 24.36it/s]\n",
      "100%|███████████████████████████████████████████████████████████████████████████████████████| 14/14 [00:00<00:00, 21.76it/s]\n",
      "100%|███████████████████████████████████████████████████████████████████████████████████| 3143/3143 [02:15<00:00, 23.26it/s]\n",
      "100%|███████████████████████████████████████████████████████████████████████████████████| 3143/3143 [02:16<00:00, 23.05it/s]\n",
      "100%|███████████████████████████████████████████████████████████████████████████████████| 3143/3143 [02:21<00:00, 22.26it/s]\n",
      "100%|███████████████████████████████████████████████████████████████████████████████████| 3143/3143 [02:21<00:00, 22.22it/s]\n",
      "100%|███████████████████████████████████████████████████████████████████████████████████| 3143/3143 [02:22<00:00, 22.08it/s]\n",
      "100%|███████████████████████████████████████████████████████████████████████████████████| 3143/3143 [02:23<00:00, 21.92it/s]\n",
      "100%|███████████████████████████████████████████████████████████████████████████████████| 3143/3143 [02:27<00:00, 21.32it/s]\n",
      "100%|███████████████████████████████████████████████████████████████████████████████████| 3143/3143 [02:27<00:00, 21.24it/s]\n",
      "100%|███████████████████████████████████████████████████████████████████████████████████| 3143/3143 [02:28<00:00, 21.12it/s]\n",
      "100%|███████████████████████████████████████████████████████████████████████████████████| 3143/3143 [02:28<00:00, 21.11it/s]\n",
      "100%|███████████████████████████████████████████████████████████████████████████████████| 3143/3143 [02:29<00:00, 21.09it/s]\n",
      "100%|███████████████████████████████████████████████████████████████████████████████████| 3143/3143 [02:29<00:00, 20.99it/s]\n",
      "100%|███████████████████████████████████████████████████████████████████████████████████| 3143/3143 [02:31<00:00, 20.71it/s]\n",
      "100%|███████████████████████████████████████████████████████████████████████████████████| 3143/3143 [02:35<00:00, 20.25it/s]\n",
      "100%|███████████████████████████████████████████████████████████████████████████████████| 3143/3143 [02:36<00:00, 20.03it/s]\n",
      "100%|███████████████████████████████████████████████████████████████████████████████████| 3143/3143 [02:40<00:00, 19.63it/s]\n",
      "100%|███████████████████████████████████████████████████████████████████████████████████| 3143/3143 [02:46<00:00, 18.87it/s]\n",
      "100%|███████████████████████████████████████████████████████████████████████████████████| 3143/3143 [02:51<00:00, 18.34it/s]\n",
      "100%|███████████████████████████████████████████████████████████████████████████████████| 3143/3143 [03:00<00:00, 17.37it/s]\n"
     ]
    }
   ],
   "source": [
    "processed = multiprocessing(list(range(len(df))), loop, cores = 20)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "59967797",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "27740"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(processed)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "b4ce1c9d",
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "\n",
    "with open('BirdCLEF-2021.json', 'w') as fopen:\n",
    "    json.dump(processed, fopen)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
